Introduction:

Neither I nor our lab members currently have data in hand, so I downloaded the COVID-19 data from WHO (https://covid19.who.int/table). This project focuses on the COVID-19 data in different countries from January to November 2020 in order to explore the pandemic situation. Many online sources have already analysed the COVID-19 data and produced good visualizations; however, it is good practice for me to learn how to manipulate, clean, and mutate data. In this project, line plots, bar plots, facets, heatmaps, and animations were used.

Load in libraries

##install packages and load in libraries
library(ggplot2)
##install.packages("lubridate")
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggpubr)
library(viridis)
## Loading required package: viridisLite
library(gganimate)

1. Load in the dataset:

  1. Load in the dataset, which is downloaded from WHO (https://covid19.who.int/table). Take a glimpse at the dataset.
getwd()
## [1] "/Users/lijiaxuan/Downloads"
## Read the downloaded WHO COVID-19 CSV (path is relative to the working directory) into a data frame
data <- read.csv(file = "WHO-COVID-19-global-data.csv")
 
 ##take a look at the data
 str(data)
## 'data.frame':    74495 obs. of  8 variables:
##  $ Date_reported    : chr  "2020-01-03" "2020-01-04" "2020-01-05" "2020-01-06" ...
##  $ Country_code     : chr  "AF" "AF" "AF" "AF" ...
##  $ Country          : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ WHO_region       : chr  "EMRO" "EMRO" "EMRO" "EMRO" ...
##  $ New_cases        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Cumulative_cases : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ New_deaths       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Cumulative_deaths: int  0 0 0 0 0 0 0 0 0 0 ...
 glimpse(data)
## Rows: 74,495
## Columns: 8
## $ Date_reported     <chr> "2020-01-03", "2020-01-04", "2020-01-05", "2020-01-…
## $ Country_code      <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF…
## $ Country           <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afgha…
## $ WHO_region        <chr> "EMRO", "EMRO", "EMRO", "EMRO", "EMRO", "EMRO", "EM…
## $ New_cases         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Cumulative_cases  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ New_deaths        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Cumulative_deaths <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
 head(data)
##   Date_reported Country_code     Country WHO_region New_cases Cumulative_cases
## 1    2020-01-03           AF Afghanistan       EMRO         0                0
## 2    2020-01-04           AF Afghanistan       EMRO         0                0
## 3    2020-01-05           AF Afghanistan       EMRO         0                0
## 4    2020-01-06           AF Afghanistan       EMRO         0                0
## 5    2020-01-07           AF Afghanistan       EMRO         0                0
## 6    2020-01-08           AF Afghanistan       EMRO         0                0
##   New_deaths Cumulative_deaths
## 1          0                 0
## 2          0                 0
## 3          0                 0
## 4          0                 0
## 5          0                 0
## 6          0                 0
 summary(data)
##  Date_reported      Country_code         Country           WHO_region       
##  Length:74495       Length:74495       Length:74495       Length:74495      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    New_cases        Cumulative_cases     New_deaths      Cumulative_deaths
##  Min.   :-32952.0   Min.   :       0   Min.   :-514.00   Min.   :     0   
##  1st Qu.:     0.0   1st Qu.:       0   1st Qu.:   0.00   1st Qu.:     0   
##  Median :     1.0   Median :     330   Median :   0.00   Median :     6   
##  Mean   :   713.7   Mean   :   57929   Mean   :  17.46   Mean   :  1985   
##  3rd Qu.:    89.0   3rd Qu.:    7584   3rd Qu.:   1.00   3rd Qu.:   133   
##  Max.   :193734.0   Max.   :10460365   Max.   :6409.00   Max.   :241186
 class(data$Country)
## [1] "character"
 ## Store Country as a factor (categorical) instead of plain character strings
 data$Country <- factor(data$Country)
 class(data$Date_reported)
## [1] "character"
 class(data$Country_code)
## [1] "character"

2. Change the date settings

Convert the date column and define monthly axis breaks, since labelling every single day makes the axis too crowded. These breaks will be used in the later plots.

##install the "lubridate" package (https://rawgit.com/rstudio/cheatsheets/master/lubridate.pdf), which is specifically designed for manipulating date-related data
##install.packages("lubridate")
library(lubridate)


## Convert the Date_reported column from the "character" class to the "Date" class
class(data$Date_reported)
## [1] "character"
data$Date_reported <- as.Date(data$Date_reported)
class(data$Date_reported)
## [1] "Date"
## Monthly breaks for the date axes; reused by the plots that follow
date_breaks <- seq(
  from = as.Date("2020-01-03"),
  to = as.Date("2020-11-14"),
  by = "1 month"
)

3. Plot the COVID-19 data from Jan to Nov in China

3.1 Make line plots

##Filter function is in the dplyr package
##library(dplyr)

## Subset the COVID-19 data for China (country code "CN").
## The column is referenced by its bare name: inside filter() the columns of
## `data` are already in scope, so prefixing with `data$` is unnecessary.
data_china <- filter(data, Country_code == "CN")
dim(data_china)
## [1] 317   8
head(data_china)
##   Date_reported Country_code Country WHO_region New_cases Cumulative_cases
## 1    2020-01-03           CN   China       WPRO         0                0
## 2    2020-01-04           CN   China       WPRO         1                1
## 3    2020-01-05           CN   China       WPRO         0                1
## 4    2020-01-06           CN   China       WPRO         3                4
## 5    2020-01-07           CN   China       WPRO         0                4
## 6    2020-01-08           CN   China       WPRO         0                4
##   New_deaths Cumulative_deaths
## 1          0                 0
## 2          0                 0
## 3          0                 0
## 4          0                 0
## 5          0                 0
## 6          0                 0
## Make the first line plot!
## Cumulative cases in China over the reported period (January to November).
ggplot(data = data_china,
       mapping = aes(x = Date_reported, y = Cumulative_cases)) +
  geom_line() +
  ggtitle("Cumulative_cases in China")

## Build one China time-series plot: small coloured points with a line drawn
## over them, monthly x-axis breaks and vertical x-axis labels.
##   y_var       name (string) of the data_china column drawn on the y axis
##   point_color colour of the points
##   plot_title  title shown above the plot
## Returns the ggplot object. Reads data_china and date_breaks defined earlier
## in this script.
plot_china_series <- function(y_var, point_color, plot_title) {
  ggplot(data_china, aes(x = Date_reported, y = .data[[y_var]])) +
    geom_point(color = point_color, size = 0.5) +
    ## y label is set explicitly so the axis shows the plain column name
    labs(title = plot_title, y = y_var) +
    geom_line() +
    scale_x_date(breaks = date_breaks) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.2, hjust = 0.2))
}

##Plot the Cumulative_cases in China from January to November.
gp_china_1 <- plot_china_series("Cumulative_cases", "lightgreen",
                                "Cumulative_cases in China")
gp_china_1

##Plot the New_cases in China from January to November.
gp_china_2 <- plot_china_series("New_cases", "lightblue",
                                "New_cases in China")
gp_china_2

##Plot the New_deaths in China from January to November.
gp_china_3 <- plot_china_series("New_deaths", "lightblue",
                                "New deaths in China")
gp_china_3

##Plot the Cumulative_deaths in China from January to November.
gp_china_4 <- plot_china_series("Cumulative_deaths", "lightgreen",
                                "Cumulative_deaths in China")
gp_china_4

##Make multiple plots for the covid-19 data in China at the same time using a for loop
## Columns 5 to 8 hold the four count variables printed below.
cases_cn <- names(data_china)[5:8]
cases_cn
## [1] "New_cases"         "Cumulative_cases"  "New_deaths"       
## [4] "Cumulative_deaths"
## Loop over the column names themselves and look each column up through the
## .data pronoun, so aes() reads from data_china instead of from a global
## vector (the old temporary was named `colnames`, shadowing base::colnames).
for (case_col in cases_cn) {
  print(
    ggplot(data_china, aes(x = Date_reported, y = .data[[case_col]])) +
      geom_line(col = "red") +
      theme_classic() +
      theme(legend.position = "none",
            plot.title = element_text(hjust = 0.5)) +
      ylab(case_col) ##name the y axis accordingly.
  )
}

3.2 Make facet plots and heatmap

##Make a facet (https://www.datanovia.com/en/lessons/combine-multiple-ggplots-into-a-figure/) for the COVID-19 data in China
library(ggpubr)
## Arrange the four China plots in a 2 x 2 grid.
## geom_jitter() is no longer added here: the arranged figure is a composite,
## so a layer added to it would not jitter the points inside the four panels.
ggarrange(gp_china_1, gp_china_2, gp_china_3, gp_china_4, ncol = 2, nrow = 2)

## Heatmap of Cumulative_cases in China: dates on the y axis (monthly breaks),
## tiles filled on the continuous viridis scale.
ggplot(data = data_china,
       mapping = aes(x = Country, y = Date_reported, fill = Cumulative_cases)) +
  geom_tile() +
  scale_y_date(breaks = date_breaks) +
  scale_fill_viridis_c()

3.3 Make barplots

###Make barplots for data in China;
## Overlay line plots on a bar plot for the Cumulative_cases and New_cases in China
bp_china1 <- ggplot(data_china, aes(x = Date_reported, y = Cumulative_cases)) +
  geom_col(color = "lightblue") +
  labs(title = "Cumulative_cases in China") +
  ## x and y are inherited from ggplot(); only the legend key is mapped here
  geom_line(aes(color = "Cumulative_cases")) +
  geom_line(aes(y = New_cases, color = "New_cases")) +
  ## Legend added manually; titled "cases" because both lines are case counts
  scale_color_manual(name = "COVID-19 cases",
                     values = c("Cumulative_cases" = "yellow",
                                "New_cases" = "red"))
bp_china1

## Overlay line plots on a bar plot for the Cumulative_deaths and New_deaths in China
bp_china2 <- ggplot(data = data_china,
                    mapping = aes(x = Date_reported, y = Cumulative_deaths)) +
  geom_col(colour = "lightblue") +
  ggtitle("Cumulative_deaths in China") +
  geom_line(mapping = aes(x = Date_reported, y = Cumulative_deaths,
                          colour = "Cumulative_deaths")) +
  geom_line(mapping = aes(x = Date_reported, y = New_deaths,
                          colour = "New_deaths")) +
  ## Legend entries and their colours are supplied by hand
  scale_color_manual(
    name = "COVID-19 deaths",
    values = c("Cumulative_deaths" = "yellow", "New_deaths" = "red")
  )
bp_china2

## Show the two China bar plots side by side (one row, two columns)
ggarrange(bp_china1, bp_china2, nrow = 1, ncol = 2)

4. Plot the COVID-19 data in US from Jan to Nov

4.1 Make line plots

## Subset the COVID-19 data for the US (country code "US").
## The column is referenced by its bare name: inside filter() the columns of
## `data` are already in scope, so prefixing with `data$` is unnecessary.
data_US <- filter(data, Country_code == "US")

## Simple red line plot of Cumulative_cases in the US with monthly x-axis
## breaks, a centred title and x-axis labels tilted by 45 degrees
ggplot(data = data_US,
       mapping = aes(x = Date_reported, y = Cumulative_cases)) +
  geom_line(colour = "red") +
  scale_x_date(breaks = date_breaks) +
  ggtitle("Cumulative_cases in US") +
  theme_classic() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, vjust = 0, hjust = 0)
  )

##Make multiple line plots for the covid-19 data in the US using a for loop
## Columns 5 to 8 hold the four count variables printed below.
cases <- names(data_US)[5:8]
cases
## [1] "New_cases"         "Cumulative_cases"  "New_deaths"       
## [4] "Cumulative_deaths"
## Loop over the column names themselves and look each column up through the
## .data pronoun, so aes() reads from data_US instead of from a global vector
## (the old temporary was named `colnames`, shadowing base::colnames).
for (case_col in cases) {
  print(
    ggplot(data_US, aes(x = Date_reported, y = .data[[case_col]])) +
      geom_line(col = "red") +
      theme_classic() +
      theme(legend.position = "none",
            plot.title = element_text(hjust = 0.5)) +
      ylab(case_col) ##name the y axis accordingly.
  )
}

4.2 Make barplots

## Simple bar plot of Cumulative_cases in the US (bars outlined in light blue)
## with monthly x-axis breaks and x-axis labels tilted by 45 degrees
ggplot(data = data_US,
       mapping = aes(x = Date_reported, y = Cumulative_cases)) +
  geom_col(colour = "lightblue") +
  scale_x_date(breaks = date_breaks) +
  ggtitle("Cumulative_cases in US") +
  theme_classic() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, vjust = 0, hjust = 0)
  )

##Make multiple bar plots for the covid-19 data in the US using a for loop
## Columns 5 to 8 hold the four count variables printed below.
cases <- names(data_US)[5:8]
cases
## [1] "New_cases"         "Cumulative_cases"  "New_deaths"       
## [4] "Cumulative_deaths"
## Loop over the column names themselves and look each column up through the
## .data pronoun, so aes() reads from data_US instead of from a global vector
## (the old temporary was named `colnames`, shadowing base::colnames).
for (case_col in cases) {
  print(
    ggplot(data_US, aes(x = Date_reported, y = .data[[case_col]])) +
      geom_bar(stat = "identity", color = "lightblue") +
      theme_classic() +
      theme(legend.position = "none",
            plot.title = element_text(hjust = 0.5)) +
      ylab(case_col) ##name the y axis accordingly.
  )
}

4.3 Make a heatmap

## Heatmap of Cumulative_cases in the US: dates on the y axis (monthly breaks),
## tiles filled on the continuous viridis scale.
ggplot(data = data_US,
       mapping = aes(x = Country, y = Date_reported, fill = Cumulative_cases)) +
  geom_tile() +
  scale_y_date(breaks = date_breaks) +
  scale_fill_viridis_c()

5. Make heatmaps and facets for several selected countries.

## Heatmap of Cumulative_cases for every country in the original dataset
library(viridis)
ggplot(data = data,
       mapping = aes(x = Country, y = Date_reported, fill = Cumulative_cases)) +
  geom_tile()

library(dplyr)
##The x axis is too intense, thus I will choose 5 countries randomly from the dataset and select them according to the country code.
## %in% tests every row's code for membership in the set. `==` would recycle
## the five codes down the column and keep a row only when its code happens to
## match the code at that recycled position, silently dropping most rows.
## NOTE: the head() printout recorded below predates this fix (its dates jump
## five days at a time); re-run the script to refresh it.
selected_country <- filter(data, Country_code %in% c("CN", "US", "GB", "FR", "CA"))
head(selected_country)
##   Date_reported Country_code Country WHO_region New_cases Cumulative_cases
## 1    2020-01-03           CA  Canada       AMRO         0                0
## 2    2020-01-08           CA  Canada       AMRO         0                0
## 3    2020-01-13           CA  Canada       AMRO         0                0
## 4    2020-01-18           CA  Canada       AMRO         0                0
## 5    2020-01-23           CA  Canada       AMRO         0                0
## 6    2020-01-28           CA  Canada       AMRO         1                7
##   New_deaths Cumulative_deaths
## 1          0                 0
## 2          0                 0
## 3          0                 0
## 4          0                 0
## 5          0                 0
## 6          0                 0
## Heatmap of Cumulative_cases for the selected countries: dates on the y axis
## (monthly breaks), viridis fill, country labels rotated to vertical.
ggplot(data = selected_country,
       mapping = aes(x = Country, y = Date_reported, fill = Cumulative_cases)) +
  geom_tile() +
  scale_y_date(breaks = date_breaks) +
  scale_fill_viridis_c() +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.2, vjust = 0.2))

##Make facets for several selected countries.
## One panel per country (two rows) showing Cumulative_deaths over time.
## The line colour is a fixed value, so it is set on the geom: inside aes()
## the string "blue" is treated as a data category and drawn in the default
## palette colour rather than in blue.
ggplot(selected_country, aes(x = Date_reported, y = Cumulative_deaths)) +
  geom_line(color = "blue") +
  theme_classic() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(~Country, nrow = 2)

6. Plot an animation for the selected_country dataset

library(gganimate)

## Static scatter plot of the selected_country data (New_cases against
## Cumulative_cases, coloured by country); the animations below build on it
Animation <- ggplot(
  data = selected_country,
  mapping = aes(x = New_cases, y = Cumulative_cases, colour = Country)
) +
  geom_point(size = 2.0, alpha = 0.7, show.legend = FALSE) +
  scale_size(range = c(1, 5)) +
  scale_color_viridis_d()

Animation

## Animate along Date_reported; the title shows the time of the current frame
Animation +
  transition_time(time = Date_reported) +
  labs(title = "Date:{frame_time}")

  ##+ scale_x_date(breaks = date_breaks)
  

## Same animation split into one panel per country; earlier points are left
## behind as small marks and the view follows the data with the y range fixed
Animation +
  facet_wrap(facets = ~Country) +
  transition_time(time = Date_reported) +
  labs(title = "Date:{frame_time}") +
  shadow_mark(size = 0.5, alpha = 1) +
  view_follow(fixed_y = TRUE)